In [1]:
import pandas as pd
import numpy as np
from sklearn import * 
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
df = pd.read_csv("/data/credit-default.csv")
df.head()


Out[2]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors ... property age installment_plan housing existing_credits default dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none ... real estate 67 none own 2 1 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none ... real estate 22 none own 1 2 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none ... real estate 49 none own 1 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor ... building society savings 45 none for free 1 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none ... unknown/none 53 none for free 2 2 2 none yes skilled employee

5 rows × 21 columns


In [3]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_length       1000 non-null object
installment_rate        1000 non-null int64
personal_status         1000 non-null object
other_debtors           1000 non-null object
residence_history       1000 non-null int64
property                1000 non-null object
age                     1000 non-null int64
installment_plan        1000 non-null object
housing                 1000 non-null object
existing_credits        1000 non-null int64
default                 1000 non-null int64
dependents              1000 non-null int64
telephone               1000 non-null object
foreign_worker          1000 non-null object
job                     1000 non-null object
dtypes: int64(8), object(13)
memory usage: 164.1+ KB

In [4]:
df.default.value_counts()


Out[4]:
1    700
2    300
Name: default, dtype: int64

In [5]:
target = "default"
label_encoder = preprocessing.LabelEncoder()

y = label_encoder.fit_transform(df[target])
X = df.drop(columns=[target])
X.head()


Out[5]:
checking_balance months_loan_duration credit_history purpose amount savings_balance employment_length installment_rate personal_status other_debtors residence_history property age installment_plan housing existing_credits dependents telephone foreign_worker job
0 < 0 DM 6 critical radio/tv 1169 unknown > 7 yrs 4 single male none 4 real estate 67 none own 2 1 yes yes skilled employee
1 1 - 200 DM 48 repaid radio/tv 5951 < 100 DM 1 - 4 yrs 2 female none 2 real estate 22 none own 1 1 none yes skilled employee
2 unknown 12 critical education 2096 < 100 DM 4 - 7 yrs 2 single male none 3 real estate 49 none own 1 2 none yes unskilled resident
3 < 0 DM 42 repaid furniture 7882 < 100 DM 4 - 7 yrs 2 single male guarantor 4 building society savings 45 none for free 1 2 none yes skilled employee
4 < 0 DM 24 delayed car (new) 4870 < 100 DM 1 - 4 yrs 3 single male none 4 unknown/none 53 none for free 2 2 none yes skilled employee

In [6]:
cat_columns = [field for field in dict(X.dtypes) 
               if X.dtypes[field] == "object"]
cat_columns


Out[6]:
['checking_balance',
 'credit_history',
 'purpose',
 'savings_balance',
 'employment_length',
 'personal_status',
 'other_debtors',
 'property',
 'installment_plan',
 'housing',
 'telephone',
 'foreign_worker',
 'job']

In [7]:
num_columns = [field for field in dict(X.dtypes) 
               if X.dtypes[field] != "object"]
num_columns


Out[7]:
['months_loan_duration',
 'amount',
 'installment_rate',
 'residence_history',
 'age',
 'existing_credits',
 'dependents']

In [8]:
cat_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='constant'
                                     , fill_value='missing')),
    ('onehot', preprocessing.OneHotEncoder(handle_unknown='error'
                                           , drop="first"))
]) 

num_pipe = pipeline.Pipeline([
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('poly', preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ('scaler', preprocessing.StandardScaler()),
])

preprocessing_pipe = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns)
])

Simple logistic regression


In [9]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", linear_model.LogisticRegression(random_state=1
                                , solver="liblinear"))
])


param_grid = {
    "est__C": np.random.random(10) + 1
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_
        , "Best parameters: ", gsearch.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
Best score:  0.755 Best parameters:  {'est__C': 1.557903180725127}
[Parallel(n_jobs=8)]: Done  35 out of  50 | elapsed:    1.5s remaining:    0.7s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.6s finished

Ensemble Classifier


In [10]:
log_clf = linear_model.LogisticRegression(C = 1.53
                            , solver= "liblinear", random_state=1) 
rnd_clf = ensemble.RandomForestClassifier(max_depth=6
                            , n_estimators = 30, random_state=1) 
svm_clf = svm.SVC(C = 1.0, gamma = 0.15, random_state=1) 


estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.VotingClassifier(voting="hard", estimators=
                                      [('lr', log_clf), 
                                       ('rf', rnd_clf), 
                                       ('svm', svm_clf)
                                      ])
    )
])


param_grid = {
    "est__svm__C": np.linspace(1.0, 20, 10)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                    , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best score:  0.765 Best parameters:  {'est__svm__C': 5.222222222222222}
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.1s finished

AdaBoost Classifier


In [11]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.AdaBoostClassifier(
          linear_model.LogisticRegression(random_state=1
                                          , solver="liblinear")
        , n_estimators=200
        , algorithm="SAMME.R"
        , learning_rate=0.051)

    )
])


param_grid = {
    "est__base_estimator__C": np.random.random(10) + 1
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                                , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    3.7s finished
Best score:  0.734 Best parameters:  {'est__base_estimator__C': 1.0258494070869997}

Bagging classifier


In [12]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.BaggingClassifier(
                tree.DecisionTreeClassifier(), 
                max_samples= 0.5,
                n_estimators=50,
                bootstrap=True, 
                oob_score=True)
    )
])


param_grid = {
    "est__base_estimator__max_depth": np.arange(5, 15)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_, "Best parameters: ", gsearch.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
Best score:  0.757 Best parameters:  {'est__base_estimator__max_depth': 12}
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    1.1s finished

Gradient Boosted Model


In [18]:
estimator_pipe = pipeline.Pipeline([
    ("preprocessing", preprocessing_pipe),
    ("est", ensemble.GradientBoostingClassifier(random_state=1))
])


param_grid = {
    "est__max_depth": np.arange(3, 10),
    "est__learning_rate": np.linspace(0.01, 1, 10)
}

gsearch = model_selection.GridSearchCV(estimator_pipe, param_grid, cv = 5
                        , verbose=1, n_jobs=8, scoring="accuracy")

gsearch.fit(X, y)

print("Best score: ", gsearch.best_score_
        , "Best parameters: ", gsearch.best_params_)


Fitting 5 folds for each of 70 candidates, totalling 350 fits
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done 205 tasks      | elapsed:   14.8s
Best score:  0.76 Best parameters:  {'est__learning_rate': 0.12, 'est__max_depth': 3}
[Parallel(n_jobs=8)]: Done 350 out of 350 | elapsed:   20.8s finished

In [23]:
scores = pd.DataFrame(gsearch.cv_results_)
scores.head()


Out[23]:
mean_fit_time std_fit_time mean_score_time std_score_time param_est__learning_rate param_est__max_depth params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 0.178265 0.008670 0.006912 0.001198 0.01 3 {'est__learning_rate': 0.01, 'est__max_depth': 3} 0.705 0.705 0.710 0.685 0.705 0.702 0.008718 70
1 0.315358 0.017413 0.006609 0.001425 0.01 4 {'est__learning_rate': 0.01, 'est__max_depth': 4} 0.730 0.740 0.720 0.700 0.710 0.720 0.014142 67
2 0.482575 0.019085 0.008350 0.000537 0.01 5 {'est__learning_rate': 0.01, 'est__max_depth': 5} 0.725 0.750 0.715 0.730 0.730 0.730 0.011402 60
3 0.660290 0.010609 0.006561 0.001300 0.01 6 {'est__learning_rate': 0.01, 'est__max_depth': 6} 0.730 0.775 0.725 0.730 0.720 0.736 0.019849 47
4 0.838601 0.033753 0.006765 0.000963 0.01 7 {'est__learning_rate': 0.01, 'est__max_depth': 7} 0.755 0.765 0.700 0.725 0.725 0.734 0.023324 52

In [24]:
scores[scores.rank_test_score == 1]


Out[24]:
mean_fit_time std_fit_time mean_score_time std_score_time param_est__learning_rate param_est__max_depth params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
7 0.190787 0.010879 0.007576 0.000844 0.12 3 {'est__learning_rate': 0.12, 'est__max_depth': 3} 0.755 0.765 0.785 0.745 0.75 0.76 0.014142 1

In [ ]: